In [ ]:
In [32]:
from collections import defaultdict, OrderedDict
import warnings
import gffutils
import pybedtools
import pandas as pd
import copy
import re
from gffutils.pybedtools_integration import tsses
from copy import deepcopy
from collections import OrderedDict, Callable
class DefaultOrderedDict(OrderedDict):
# Source: http://stackoverflow.com/a/6190500/562769
def __init__(self, default_factory=None, *a, **kw):
if (default_factory is not None and
not isinstance(default_factory, Callable)):
raise TypeError('first argument must be callable')
OrderedDict.__init__(self, *a, **kw)
self.default_factory = default_factory
def __getitem__(self, key):
try:
return OrderedDict.__getitem__(self, key)
except KeyError:
return self.__missing__(key)
def __missing__(self, key):
if self.default_factory is None:
raise KeyError(key)
self[key] = value = self.default_factory()
return value
def __reduce__(self):
if self.default_factory is None:
args = tuple()
else:
args = self.default_factory,
return type(self), args, None, None, self.items()
def copy(self):
return self.__copy__()
def __copy__(self):
return type(self)(self.default_factory, self)
def __deepcopy__(self, memo):
import copy
return type(self)(self.default_factory,
copy.deepcopy(self.items()))
def __repr__(self):
return 'OrderedDefaultDict(%s, %s)' % (self.default_factory,
OrderedDict.__repr__(self))
In [26]:
gencode_v25 = '/home/cmb-panasas2/skchoudh/genomes/hg38/annotation/gencode.v25.annotation.gtf'
gencode_v25_db = '/home/cmb-panasas2/skchoudh/genomes/hg38/annotation/gencode.v25.annotation.gtf.db'
prefix = '/home/cmb-panasas2/skchoudh/genomes/hg38/annotation/gencode.v25.gffutils'
old_prefix = '/home/cmb-panasas2/skchoudh/github_projects/gencode_regions/data/GRCh37/v25/'
hg38_chrsizes = '/home/cmb-panasas2/skchoudh/genomes/hg38/fasta/hg38.chrom.sizes'
In [33]:
#db = gffutils.create_db(gencode_v25, dbfn=gencode_v25_db, merge_strategy='merge',
# disable_infer_genes=True, disable_infer_transcripts=True, force=True)
def create_gene_dict(db):
'''
Store each feature line db.all_features() as a dict of dicts
'''
gene_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
for line_no, feature in enumerate(db.all_features()):
gene_ids = feature.attributes['gene_id']
feature_type = feature.featuretype
if feature_type == 'gene':
if len(gene_ids)!=1:
logging.warning('Found multiple gene_ids on line {} in gtf'.format(line_no))
break
else:
gene_id = gene_ids[0]
gene_dict[gene_id]['gene'] = feature
else:
transcript_ids = feature.attributes['transcript_id']
for gene_id in gene_ids:
for transcript_id in transcript_ids:
gene_dict[gene_id][transcript_id][feature_type].append(feature)
return gene_dict
In [34]:
db = gffutils.FeatureDB(gencode_v25_db, keep_order=True)
gene_dict = create_gene_dict(db)
In [35]:
def get_gene_list(gene_dict):
return list(set(gene_dict.keys()))
def get_UTR_regions(utrs, cds):
if len(cds)==0:
return [], []
utr5_regions = []
utr3_regions = []
cds_sorted = sorted(list(cds), key=lambda x: x.start)
first_cds = cds_sorted[0]
last_cds = cds_sorted[-1]
for orig_utr in utrs:
utr = deepcopy(orig_utr)
## Push all cds at once
## Sort later to remove duplicates
strand = utr.strand
if utr.start < first_cds.start:
if utr.stop >= first_cds.start:
utr.stop = first_cds.start - 1
if strand == '+':
utr5_regions.append(utr)
else:
utr3_regions.append(utr)
elif utr.stop > last_cds.stop:
if utr.start <= last_cds.stop:
utr.start = last_cds.stop + 1
if strand == '+':
utr3_regions.append(utr)
else:
utr5_regions.append(utr)
"""
if strand == '+':
if utr.stop < first_cds.start:
utr.feature_type = 'five_prime_UTR'
utr5_regions.append(utr)
elif utr.start > last_cds.stop:
utr.feature_type = 'three_prime_UTR'
utr3_regions.append(utr)
elif utr.start < first_cds.start:
utr.feature_type = 'five_prime_UTR'
utr.stop = first_cds.start - 1
utr5_regions.append(utr)
elif utr.stop >= last_cds.stop:
utr.feature_type = 'three_prime_UTR'
utr.start = last_cds.stop + 1
utr3_regions.append(utr)
else:
#print('Error: strand + | first cds start: {} | last cds stop: {} | UTR start : {} | UTR stop : {}'.format(first_cds.start,
# last_cds.stop,
# utr.start,
# utr.stop))
continue
elif strand == '-':
if utr.stop < first_cds.start:
utr.feature_type = 'three_prime_UTR'
utr3_regions.append(utr)
elif utr.start > last_cds.stop:
utr.feature_type = 'five_prime_UTR'
utr5_regions.append(utr)
elif utr.start < first_cds.start:
utr.feature_type = 'three_prime_UTR'
utr.stop = first_cds.start - 1
utr3_regions.append(utr)
elif utr.stop >= last_cds.stop:
utr.feature_type = 'five_prime_UTR'
utr.start = last_cds.stop + 1
utr5_regions.append(utr)
else:
# This UTR is not a true UTR
#print('Error: strand - | first cds start: {} | last cds stop: {} | UTR start : {} | UTR stop : {}'.format(first_cds.start,
# last_cds.stop,
# utr.start,
# utr.stop))
#
continue
"""
return utr5_regions, utr3_regions
def create_bed(regions, bedtype='0'):
'''Create bed from list of regions
bedtype: 0 or 1
0-Based or 1-based coordinate of the BED
'''
bedstr = ''
for region in regions:
assert len(region.attributes['gene_id']) == 1
## GTF start is 1-based, so shift by one while writing
## to 0-based BED format
if bedtype == '0':
start = region.start - 1
else:
start = region.start
bedstr += '{}\t{}\t{}\t{}\t{}\t{}\n'.format(region.chrom,
start,
region.stop,
re.sub('\.\d+', '', region.attributes['gene_id'][0]),
'.',
region.strand)
return bedstr
def rename_regions(regions, gene_id):
regions = list(regions)
if len(regions) == 0:
return []
for region in regions:
region.attributes['gene_id'] = gene_id
return regions
def merge_regions(db, regions):
if len(regions) == 0:
return []
merged = db.merge(sorted(list(regions), key=lambda x: x.start))
return merged
def merge_regions_nostrand(db, regions):
if len(regions) == 0:
return []
merged = db.merge(sorted(list(regions), key=lambda x: x.start), ignore_strand=True)
return merged
In [36]:
utr5_bed = ''
utr3_bed = ''
gene_bed = ''
exon_bed = ''
intron_bed = ''
start_codon_bed = ''
stop_codon_bed = ''
cds_bed = ''
gene_list = []
for gene_id in get_gene_list(gene_dict):
gene_list.append(gene_dict[gene_id]['gene'])
utr5_regions, utr3_regions = [], []
exon_regions, intron_regions = [], []
star_codon_regions, stop_codon_regions = [], []
cds_regions = []
utr_regions = []
for feature in gene_dict[gene_id].keys():
if feature == 'gene':
continue
cds = list(gene_dict[gene_id][feature]['CDS'])
exons = list(gene_dict[gene_id][feature]['exon'])
utrs = list(gene_dict[gene_id][feature]['UTR'])
cds = sorted(list(cds), key=lambda x: x.start)
exons = sorted(list(exons), key=lambda x: x.start)
utrs = sorted(list(utrs), key=lambda x: x.start)
merged_exons = merge_regions(db, exons)
introns = db.interfeatures(merged_exons)
exon_regions += exons
intron_regions += introns
cds_regions += cds
utr_regions += utrs
cds_regions = sorted(list(cds_regions), key=lambda x: x.start)
utr_regions = sorted(list(utr_regions), key=lambda x: x.start)
utr5_regions, utr3_regions = get_UTR_regions(utr_regions, cds_regions)
merged_utr5 = merge_regions(db, utr5_regions)
renamed_utr5 = rename_regions(merged_utr5, gene_id)
merged_utr3 = merge_regions(db, utr3_regions)
renamed_utr3 = rename_regions(merged_utr3, gene_id)
##merged_exons = merge_regions(db, exon_regions)
##renamed_exons = rename_regions(merged_exons, gene_id)
##merged_introns = merge_regions(db, intron_regions)
##renamed_introns = rename_regions(merged_introns, gene_id)
merged_cds = merge_regions(db, cds_regions)
renamed_cds = rename_regions(merged_cds, gene_id)
utr3_bed += create_bed(renamed_utr3)
utr5_bed += create_bed(renamed_utr5)
##exon_bed += create_bed(renamed_exons)
##intron_bed += create_bed(renamed_introns)
cds_bed += create_bed(renamed_cds)
gene_bed = create_bed(gene_list)
gene_bedtool = pybedtools.BedTool(gene_bed, from_string=True)
utr5_bedtool = pybedtools.BedTool(utr5_bed, from_string=True)
utr3_bedtool = pybedtools.BedTool(utr3_bed, from_string=True)
##exon_bedtool = pybedtools.BedTool(exon_bed, from_string=True)
##intron_bedtool = pybedtools.BedTool(intron_bed, from_string=True)
cds_bedtool = pybedtools.BedTool(cds_bed, from_string=True)
gene_bedtool.remove_invalid().sort().saveas('{}.genes_all.bed'.format(prefix))
utr5_bedtool.remove_invalid().sort().saveas('{}.UTR5_all.bed'.format(prefix))
utr3_bedtool.remove_invalid().sort().saveas('{}.UTR3_all.bed'.format(prefix))
##exon_bedtool.remove_invalid().sort().saveas('{}.exon_all.bed'.format(prefix))
##intron_bedtool.remove_invalid().sort().saveas('{}.intron_all.bed'.format(prefix))
cds_bedtool.remove_invalid().sort().saveas('{}.cds_all.bed'.format(prefix))
Out[36]:
In [37]:
# Remove cds from utr5 and utr3
# We trust teh CDS coordinates always, so just chuck off the overlapping UTR coordinates
utr5_all = pybedtools.BedTool('{}.UTR5.bed'.format(prefix))
utr3_all = pybedtools.BedTool('{}.UTR3.bed'.format(prefix))
cds_bed = pybedtools.BedTool('{}.cds.bed'.format(prefix))
"""
utr5_cds_subtracted = utr5_all.subtract(cds_bed)
utr3_cds_subtracted = utr3_all.subtract(cds_bed)
utr5_cds_subtracted.remove_invalid().sort().saveas('{}.UTR5.bed'.format(prefix))
utr3_cds_subtracted.remove_invalid().sort().saveas('{}.UTR3.bed'.format(prefix))
"""
Out[37]:
In [ ]:
for gene_id in get_gene_list(gene_dict):
start_codons = []
stop_codons = []
for start_codon in db.children(gene_id, featuretype='start_codon'):
## 1 -based stop
## 0-based start handled while converting to bed
start_codon.stop = start_codon.start
start_codons.append(start_codon)
for stop_codon in db.children(gene_id, featuretype='stop_codon'):
stop_codon.start = stop_codon.stop
stop_codon.stop = stop_codon.stop+1
stop_codons.append(stop_codon)
merged_start_codons = merge_regions(db, start_codons)
renamed_start_codons = rename_regions(merged_start_codons, gene_id)
merged_stop_codons = merge_regions(db, stop_codons)
renamed_stop_codons = rename_regions(merged_stop_codons, gene_id)
start_codon_bed += create_bed(renamed_start_codons)
stop_codon_bed += create_bed(renamed_stop_codons)
start_codon_bedtool = pybedtools.BedTool(start_codon_bed, from_string=True)
stop_codon_bedtool = pybedtools.BedTool(stop_codon_bed, from_string=True)
start_codon_bedtool.remove_invalid().sort().saveas('{}.start_codon.bed'.format(prefix))
stop_codon_bedtool.remove_invalid().sort().saveas('{}.stop_codon.bed'.format(prefix))
In [ ]:
In [ ]:
## TSS
polyA_sites_bed = ''
tss_sites_bed = ''
for gene_id in get_gene_list(gene_dict):
tss_sites = []
polyA_sites = []
for transcript in db.children(gene_id, featuretype='transcript'):
start_t = copy.deepcopy(transcript)
stop_t = copy.deepcopy(transcript)
start_t.stop = start_t.start + 1
stop_t.start = stop_t.stop
if transcript.strand == '-':
start_t, stop_t = stop_t, start_t
polyA_sites.append(start_t)
tss_sites.append(stop_t)
merged_polyA_sites = merge_regions(db, polyA_sites)
renamed_polyA_sites = rename_regions(merged_polyA_sites, gene_id)
merged_tss_sites = merge_regions(db, tss_sites)
renamed_tss_sites = rename_regions(merged_tss_sites, gene_id)
polyA_sites_bed += create_bed(renamed_polyA_sites)
tss_sites_bed += create_bed(renamed_tss_sites)
polyA_sites_bedtool = pybedtools.BedTool(polyA_sites_bed, from_string=True)
tss_sites_bedtool = pybedtools.BedTool(tss_sites_bed, from_string=True)
polyA_sites_bedtool.remove_invalid().sort().saveas('{}.polyA_sites.bed'.format(prefix))
tss_sites_bedtool.remove_invalid().sort().saveas('{}.tss_sites.bed'.format(prefix))
In [ ]:
tss = tsses(db, as_bed6=True, merge_overlapping=True)
tss.remove_invalid().sort().saveas('{}.tss_temp.bed'.format(prefix))
promoter = tss.slop(l=1000, r=1000, s=True, g=hg38_chrsizes)
promoter.remove_invalid().sort().saveas('{}.promoter.1000.bed'.format(prefix))
In [ ]:
for l in [1000, 2000, 3000, 4000, 5000]:
promoter = tss.slop(l=l, r=l, s=True, g=hg38_chrsizes)
promoter.remove_invalid().sort().saveas('{}.promoter.{}.bed'.format(prefix, l))
In [ ]:
for x in db.featuretypes():
print(x)
In [ ]:
tRNA_sites = []
tRNA_bed = ''
for gene_id in get_gene_list(gene_dict):
for transcript in db.children(gene_id, featuretype='transcript'):
if 'tRNA' in transcript.attributes['gene_type'] or 'Mt_tRNA' in transcript.attributes['gene_type']:
tRNA_sites.append(transcript)
#merged_tRNA_sites = merge_regions_nostrand(db, tRNA_sites)
#renamed_tRNA_sites = rename_regions(merged_tRNA_sites, gene_id)
tRNA_bed += create_bed(tRNA_sites)
tRNA_sites_bedtool = pybedtools.BedTool(tRNA_bed, from_string=True)
tRNA_sites_bedtool.remove_invalid().sort().saveas('{}.tRNA_sites.bed'.format(prefix))
In [ ]:
tRNA_sites_bedtool.to_dataframe()
In [ ]:
rRNA_sites = []
rRNA_bed = ''
for gene_id in get_gene_list(gene_dict):
for transcript in db.children(gene_id, featuretype='transcript'):
if 'rRNA' in transcript.attributes['gene_type']:
rRNA_sites.append(transcript)
#renamed_rRNA_sites = rename_regions(rRNA_sites, gene_id)
rRNA_bed += create_bed(rRNA_sites)
rRNA_sites_bedtool = pybedtools.BedTool(rRNA_bed, from_string=True)
rRNA_sites_bedtool.remove_invalid().sort().saveas('{}.rRNA_sites.bed'.format(prefix))
In [ ]:
rRNA_sites_bedtool.to_dataframe()
In [ ]:
gene_id = 'ENSG00000279457.3'
for feature in gene_dict[gene_id].keys():
if feature == 'gene':
continue
cds = list(gene_dict[gene_id][feature]['CDS'])
cds = sorted(list(cds), key=lambda x: x.start)
for c in cds:
print('CDS: {} {}'.format(c.start, c.stop))
#exons = list(gene_dict[gene_id][feature]['exon'])
#merged_exons = merge_regions(db, exons)
introns = db.interfeatures(merged_exons)
#utr5_region, utr3_region = get_UTR_regions(gene_dict, gene_id, feature, cds)
utrs = gene_dict[gene_id][feature]['UTR']
for utr in utrs:
print('UTR : {} {}'.format(utr.start, utr.stop))
print ('#######################')
In [ ]: